library(tidyverse)
── Attaching packages ──────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.0       ✔ purrr   0.3.2
✔ tibble  2.1.3       ✔ dplyr   0.8.1
✔ tidyr   0.8.3.9000  ✔ stringr 1.4.0
✔ readr   1.1.1       ✔ forcats 0.3.0
package ‘ggplot2’ was built under R version 3.5.2; package ‘tibble’ was built under R version 3.5.2; package ‘purrr’ was built under R version 3.5.2; package ‘dplyr’ was built under R version 3.5.2; package ‘stringr’ was built under R version 3.5.2
── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Warning message:
package ‘h2o’ was built under R version 3.5.2
library(timetk)
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:h2o’:
day, hour, month, week, year
The following object is masked from ‘package:base’:
date
library(plotly)
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
### NOTE if you import fpp2 second you can use its autoplot() method. I like the ggfortify one better so I will import it last so that that one is used
library(fpp2)
Loading required package: forecast
Attaching package: ‘forecast’
The following object is masked from ‘package:ggplot2’:
autolayer
Loading required package: fma
Loading required package: expsmooth
replacing previous import ‘forecast::autolayer’ by ‘ggplot2::autolayer’ when loading ‘fpp2’
library(ggfortify)
# Load project helpers — presumably defines convert() and forecast_eval()
# used throughout this script; verify against module.r.
source('module.r')
# this suppresses scientific notation in printed output
options(scipen = 999)
# Raw Seattle bike-counter data (NOTE: the .rds filename itself is misspelled "seatle")
dataframe = readRDS('seatleBike.rds')
# Aggregate the raw counts to one observation per date, keeping only dates
# after 2014-01-01. (Fix: TRUE instead of the reassignable shorthand T.)
df_day = dataframe %>%
  select(ds = Date, y = value) %>%
  mutate(ds = ymd(ds)) %>%
  filter(ds > ymd(20140101)) %>%
  group_by(ds) %>%
  summarise(y = sum(y, na.rm = TRUE)) %>%
  # The most recent week will rarely be complete; an incomplete week's sum
  # would obviously be lower than it should be, so drop the last row.
  head(-1)
train = df_day %>% filter(ds < ymd("20180101")) %>% convert()
Non-numeric columns being dropped: ds
test = df_day %>% filter(ds >= ymd("20180101")) %>% convert()
Non-numeric columns being dropped: ds
# Daily traffic plot. The second geom_line carries a `text` aesthetic so that
# ggplotly() can render a custom tooltip; ggplot2 itself warns
# "Ignoring unknown aesthetics: text" — that is expected.
# Fixes: "Seatle" → "Seattle" in the title; FALSE instead of F.
figure1 = df_day %>%
  ggplot(aes(ds, y)) +
  geom_line() +
  geom_line(aes(text = paste("Date: ", ds, "\nObserved Value =", y))) +
  theme_minimal() +
  geom_smooth(se = FALSE) +
  labs(title = "Daily Bicycle Traffic in Seattle", x = "Date",
       y = "Count Of Bikers Detected")
Ignoring unknown aesthetics: text
ggplotly(figure1, tooltip = "text")
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
p = ggAcf(df_day %>% convert()) + theme_light() + labs(title = "ACF plot of Seattle Bikes Series")
Non-numeric columns being dropped: ds
Non-numeric columns being dropped: ds
p
Let's start off with some simple models.
When we have this kind of auto-correlation pattern we can typically do quite well by just guessing the value observed the previous year. We can achieve this with the snaive() function, which creates a seasonal naive model.
# Seasonal naive benchmark: forecast each future period with the value
# observed in the same period one season earlier. Bootstrapped 89% intervals.
# (Fix: TRUE instead of T.)
naiv = train %>%
  snaive(h = length(test), bootstrap = TRUE, level = 0.89)
naiv %>%
  forecast_eval(model_name = "SNaive")
# Accuracy against the held-out test set (table printed below).
naiv %>%
  forecast(length(test)) %>%
  accuracy(test)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set -1404.631 66982.97 35365.62 -7.709477 24.45164 1.000000 0.6055990 NA
Test set 43208.579 103535.43 56953.55 16.118496 29.46728 1.610421 0.6251249 1.306791
snaive:
\(\hat{y}_{T+h|T} = y_{T+h-m(k+1)}\)
\(\hat{y}_{T}\) = current period
h = forecast horizon
h|t = forecasting h steps ahead from time t
m = seasonality
k = floor((h−1) / m) or the integer part of (h−1) / m
# Residual diagnostics (ACF plot + Ljung-Box test). The tiny p-value below
# shows the residuals are still autocorrelated: snaive leaves signal behind.
naiv %>%
checkresiduals()
Ljung-Box test
data: Residuals from Seasonal naive method
Q* = 207, df = 104, p-value = 0.000000008122
Model df: 0. Total lags used: 104
# Exponential smoothing fit with a bootstrapped 89% interval forecast.
# Fix: the Box-Cox argument was misspelled `lamda`, so it was silently
# swallowed by `...` and ignored; the correct name is `lambda`.
# Also TRUE instead of T.
ETS = train %>%
  ets() %>%
  forecast(h = length(test), bootstrap = TRUE, level = 0.89, lambda = 'auto')
I can't handle data with frequency greater than 24. Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
# Evaluate the ETS forecast and compute test-set accuracy.
# (Fix: TRUE instead of T.)
ETS %>%
  forecast_eval('ETS')
ETS %>%
  forecast(length(test), bootstrap = TRUE) %>%
  accuracy(test)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set -2024.736 41104.76 24730.16 -5.635733 19.08057 0.6992712 -0.04760931 NA
Test set 34786.585 115266.52 65901.77 -2.416944 40.94952 1.8634415 0.79147296 1.815072
If I don’t bootstrap I get extremely unreasonable intervals.
# Same ETS model without bootstrapping, to illustrate the unreasonable
# analytic prediction intervals. (Fix: FALSE instead of F.)
ETS = train %>%
  ets() %>%
  forecast(h = length(test), bootstrap = FALSE)
I can't handle data with frequency greater than 24. Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
ETS %>%
  forecast_eval()
# STL decomposition + forecast of the seasonally adjusted series (stlf),
# with a Box-Cox transform — works around ets()'s frequency-24 limit.
# (Fix: TRUE instead of T.)
ETS = train %>%
  stlf() %>%
  forecast(h = length(test), bootstrap = TRUE, lambda = 'auto')
ETS %>%
  forecast_eval()
# BATS model, evaluated with and without bootstrapped intervals.
# (Fixes: TRUE/FALSE instead of T/F; the original refit an identical bats()
# model before the second forecast — the fit is deterministic for a fixed
# series, so a single fit is reused.)
BATS = train %>%
  bats()
BATS_for = BATS %>%
  forecast(h = length(test), bootstrap = TRUE)
BATS_for %>%
  forecast_eval(model_name = "bats")
BATS_for = BATS %>%
  forecast(h = length(test), bootstrap = FALSE)
BATS_for %>%
  forecast_eval(model_name = "bats")
# Decomposition plot of the fitted components.
BATS %>%
  plot()
# Test-set accuracy (table printed below).
BATS %>%
  forecast(length(test)) %>%
  accuracy(test)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set 2979.105 30498.29 16711.84 -0.3212479 11.6302 0.4725448 0.2913184 NA
Test set 35283.412 95291.34 49158.74 9.0475831 26.3837 1.3900148 0.6199247 1.10886
# TBATS: trigonometric seasonality, Box-Cox, ARMA errors, trend and
# seasonal components — typically strong on weekly data.
TBATS = train %>%
tbats()
# Forecast over the test horizon (default 80%/95% intervals).
TBATS_for = TBATS %>%
forecast(h = length(test))
TBATS_for %>%
forecast_eval("Tbats")
# Test-set accuracy (table printed below).
TBATS %>%
forecast(length(test)) %>%
accuracy(test)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set 5719.533 37568.64 20331.15 -0.3098929 14.25198 0.5748845 0.3234638 NA
Test set 38994.871 93390.63 48546.90 11.9215730 26.56085 1.3727144 0.6549402 1.11506
# Autoregressive neural network. PI = TRUE is required for nnetar forecasts
# to produce prediction intervals (off by default because it is slow).
# (Fix: TRUE instead of T.)
nnet = train %>%
  nnetar() %>%
  forecast(h = length(test), PI = TRUE, bootstrap = TRUE)
nnet %>%
  forecast_eval("Ar Neural Network")
# Auto ARIMA. Fix: the original passed `Bootstrap = T` (capital B), which
# forecast() does not recognise — the session even warned
# "The non-existent Bootstrap arguments will be ignored". The argument
# is `bootstrap`, and TRUE is used instead of T.
aarim = train %>%
  auto.arima()
aarim %>%
  forecast(h = length(test), bootstrap = TRUE) %>%
  forecast_eval(model_name = "Arima")
The non-existent Bootstrap arguments will be ignored.
summary(aarim)
Series: .
ARIMA(1,0,0)(1,1,0)[52] with drift
Coefficients:
ar1 sar1 drift
0.6122 -0.5240 -79.8268
s.e. 0.0625 0.0729 131.9496
sigma^2 estimated as 2078698951: log likelihood=-1914.06
AIC=3836.12 AICc=3836.38 BIC=3848.34
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 762.3817 39136.59 21273.77 -3.187623 15.87119 0.6015382 -0.003445345
Alright so our top performing classical forecasting model is tbats. This comes as no surprise, tbats typically performs quite strongly on weekly data. Let’s zoom in on just the point estimates themselves. I’ve added a loess smoother to visualize the signal that was extracted.
library(sweep)
# Tidy the TBATS forecast into a data frame and overlay it on the test set.
tbats_frame = TBATS_for %>% sw_sweep() %>%
  select(y, key, lo.95, hi.95) %>%
  bind_cols(df_day %>% select(ds, ytrue = y)) %>%
  filter(ds >= ymd("20180101"))
tbats_plot = tbats_frame %>%
  ggplot(aes(x = ds)) +
  geom_point(aes(y = ytrue), color = "black") +
  geom_point(aes(y = y), color = "orange", alpha = 0.6) +
  #geom_line(aes(y = y), color = "light blue")+
  geom_ribbon(aes(ymin = lo.95, ymax = hi.95), fill = 'grey', color = "light grey", alpha = 0.2) +
  #geom_smooth(aes(y =y), color = "light blue", method = "loess")+
  theme_minimal() +
  # Fix: the subtitle claimed an "89% Prediction Interval Orange", but the
  # ribbon plots lo.95/hi.95 (a 95% interval) and is rendered in grey.
  labs(title = "TBATS Fit on Test Set",
       subtitle = "Test set values black, point estimate orange, 95% Prediction Interval grey",
       y = "Count Of Bikers Recorded", x = "Week")
tbats_plot
Data at the end is noisy because it’s imputed, badly to be quite honest. The signal might be closer to the true biker count.
dataframe %>% group_by(key) %>% summarise(most_recent = max(Date), start = min(Date)) %>% arrange(most_recent)
# shoutout to business science for continually putting out amazing packages for the community like anomalize
library(anomalize)
# STL-decompose the series and flag anomalies in the remainder component.
# Fix: as.tibble() is deprecated (the session warning said so);
# use as_tibble() instead.
decomp_feats = df_day %>%
  decompose_stl(target = y) %>%
  anomalize(target = remainder, alpha = 0.2) %>%
  mutate(anomaly = factor(anomaly, levels = c("Yes", "No"))) %>%
  as_tibble()
frequency = 13 weeks
trend = 52 weeks
`as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
This warning is displayed once per session.
# Complete weekly date grid so any gaps in the series become explicit rows.
Dates = tibble(ds = seq(min(df_day$ds), max(df_day$ds), "week"))
# Join the decomposition features onto the completed grid (both joins are by ds).
time_df = df_day %>%
right_join(Dates) %>%
left_join(decomp_feats) %>%
# NOTE(review): fill() is called with no columns, which fills nothing —
# presumably the intent was to carry values forward over the gap rows; confirm.
fill() %>%
mutate(month = month(ds), year = year(ds))
Joining, by = "ds"
Joining, by = "ds"
# Supervised-learning features: next-week target, lagged values, moving
# averages, and a summer-months indicator. Note the naming is off by one:
# lag2 is the 1-step lag, lag3 the 2-step lag, and so on.
# (Fix: FALSE instead of F in factor().)
time_df = time_df %>%
  mutate(target = lead(y, 1), lag2 = lag(y, 1), lag3 = lag(y, 2), lag4 = lag(y, 3),
         lag5 = lag(y, 4), ma3 = (y + lag2 + lag3) / 3, ma4 = (y + lag2 + lag3 + lag4) / 4,
         ma5 = (y + lag2 + lag3 + lag4 + lag5) / 5, summer = if_else(month %in% c(5, 6, 7, 8), 1, 0),
         summer = factor(summer, ordered = FALSE, levels = c(0, 1))) %>% drop_na() %>% select(-y)
# Hold out the final observation (its target is next week's still-unseen value).
last_obs = time_df %>% tail(1)
time_df = time_df %>% head(-1) %>%
  select_if(~ !any(is.na(.))) %>%
  mutate_if(is.ordered, ~ as.character(.) %>% as.factor)
# NOTE(review): drop_na() already ran above, so this second call is a no-op.
time_df = time_df %>% drop_na()
# Time-based split: train on pre-2017, validate on 2017, test on 2018+.
train = time_df %>% filter(year < 2017)
valid = time_df %>% filter(year == 2017)
test = time_df %>% filter(year > 2017)
NOTE TO SELF: do not move h2o import, masks the lubridate month function and creates big problems
library(h2o)
h2o.init()
H2O is not running yet, starting it now...
Note: In case of errors look at the following log files:
/var/folders/zp/2f6snk2x11ndcnx60k_d7scm0000gn/T//RtmpSPYj8z/h2o_bryant_started_from_r.out
/var/folders/zp/2f6snk2x11ndcnx60k_d7scm0000gn/T//RtmpSPYj8z/h2o_bryant_started_from_r.err
java version "9.0.1"
Java(TM) SE Runtime Environment (build 9.0.1+11)
Java HotSpot(TM) 64-Bit Server VM (build 9.0.1+11, mixed mode)
Starting H2O JVM and connecting: ....... Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 2 seconds 300 milliseconds
H2O cluster timezone: America/New_York
H2O data parsing timezone: UTC
H2O cluster version: 3.22.1.1
H2O cluster version age: 6 months and 10 days !!!
H2O cluster name: H2O_started_from_R_bryant_oez965
H2O cluster total nodes: 1
H2O cluster total memory: 4.00 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
R Version: R version 3.5.0 (2018-04-23)
Your H2O cluster version is too old (6 months and 10 days)!
Please download and install the latest version from http://h2o.ai/download/
# Silence H2O progress bars for cleaner output.
h2o.no_progress()
# Push the splits to the H2O cluster; ds is dropped since the raw date
# is not used as a model feature (month/year already encode it).
train_h2o <- as.h2o(train %>% select(-ds))
valid_h2o <- as.h2o(valid %>% select(-ds))
test_h2o <- as.h2o(test %>% select(-ds))
This is how the lag and lead functions work. I wrote this quick check to confirm they worked the way I thought they did.
# Sanity check: lag() shifts values toward the past (row i gets row i-1's
# value), lead() toward the future (row i gets row i+1's value).
tibble(test = c(1, 2, 3,4, 5)) %>%
mutate(lag_test = lag(test, 1), lead_test = lead(test, 1))
library(xgboost)
package ‘xgboost’ was built under R version 3.5.2
Attaching package: ‘xgboost’
The following object is masked from ‘package:plotly’:
slice
The following object is masked from ‘package:dplyr’:
slice
# Set names for h2o: target column and all remaining columns as predictors.
y <- "target"
x <- setdiff(names(train_h2o), y)
# AutoML search with a one-hour budget.
# Fix: the argument is `nfolds`, not `nfold` — the original worked only via
# R's partial argument matching, which should be spelled out.
automl_models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame = train_h2o,
  validation_frame = valid_h2o,
  leaderboard_frame = test_h2o,
  max_runtime_secs = 60 * 60,
  # cross-validation disabled: explicit validation/leaderboard frames are used
  nfolds = 0,
  # none of these will do well trust me
  # gbm and xgboost are very similar, thats why
  # gbm is left out
  exclude_algos = c("GBM", "DRF", "GLM"),
  stopping_metric = "MSE",
  sort_metric = "MSE")
There are NA factor levels being included in the feature importances returned by the function. I’m honestly not sure why, as we can see there are no missing values in any of the sets. The feature importance for these features is 0. This may simply be an indication that there are no missing values in the object. That’s what I’ll assume for now.
h2o.nacnt(train_h2o)
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
h2o.nacnt(valid_h2o)
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
h2o.nacnt(test_h2o)
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I don’t want the NA feature importances which are all 0, so I am going to filter them out before plotting.
# Variable importances from the leading model.
# NOTE(review): `automl_leader` is not defined anywhere in this file —
# presumably automl_models_h2o@leader assigned elsewhere; confirm.
importance = h2o.varimp(automl_leader)
# Drop the all-zero "NA" factor-level rows before plotting.
importance %>%
filter(!str_detect(variable, "NA")) %>%
# NOTE(review): reorder(..., -scaled_importance) combined with coord_flip()
# places the most important feature at the BOTTOM of the chart; drop the
# minus sign if most-important-on-top was intended.
ggplot(aes(reorder(variable, -scaled_importance), scaled_importance))+
geom_col()+
coord_flip()+
theme_minimal()+
labs(x = "", y= "Scaled Importance")
The h2o plot does not include the missing feature value importances, which likely confirms my previous suspicion.
# H2O's built-in importance plot (it excludes the zero-importance NA levels).
h2o.varimp_plot(automl_leader)
# Re-assert non-scientific printing (already set earlier; harmless repeat).
options(scipen = 999)
# Partial dependence of the prediction on the anomaly flag.
h2o.partialPlot(automl_leader, test_h2o, 'anomaly')
PartialDependence: Partial Dependence Plot of model DeepLearning_grid_1_AutoML_20190709_133414_model_56 on column 'anomaly'
automl_models_h2o
An object of class "H2OAutoML"
Slot "project_name":
[1] "automl_data.frame_sid_8069_1"
Slot "leader":
Model Details:
==============
H2ORegressionModel: deeplearning
Model ID: DeepLearning_grid_1_AutoML_20190709_133414_model_56
Status of Neuron Layers: predicting target, regression, gaussian distribution, Quadratic loss, 10,501 weights/biases, 133.9 KB, 1,123,280 training samples, mini-batch size 1
H2ORegressionMetrics: deeplearning
** Reported on training data. **
** Metrics reported on full training frame **
MSE: 1490746284
RMSE: 38610.18
MAE: 25427.19
RMSLE: 0.228172
Mean Residual Deviance : 1490746284
H2ORegressionMetrics: deeplearning
** Reported on validation data. **
** Metrics reported on full validation frame **
MSE: 1670375929
RMSE: 40870.23
MAE: 24883.25
RMSLE: 0.2863995
Mean Residual Deviance : 1670375929
Slot "leaderboard":
[102 rows x 6 columns]
# Per-observation errors for the AutoML model on the test years.
# Fix: as.tibble() is deprecated; use as_tibble().
# NOTE(review): time_df dropped y earlier (select(-y)), so rename(actual = y)
# looks like it would fail here — confirm which frame this actually ran on.
# `pred_h2o` is also not created in the visible code (presumably
# h2o.predict() output from elsewhere).
error_tbl = time_df %>%
  filter(lubridate::year(ds) > 2017) %>%
  add_column(pred = pred_h2o %>% as_tibble() %>% pull(predict)) %>%
  rename(actual = y) %>%
  mutate(
    error = actual - pred,
    error_pct = error / actual
  )
# Observed series (grey) with AutoML test-set predictions overlaid (blue).
# NOTE(review): `observed` is not created in the visible code — it presumably
# comes from the decompose_stl() columns joined in via decomp_feats; confirm.
time_df %>%
ggplot(aes(ds, observed))+
geom_point(color = "dark grey")+
geom_point(aes(ds, pred), data = error_tbl, color = "light blue")+
geom_smooth(aes(ds, pred), data = error_tbl, color = "light blue", method = "loess")+
labs(title = "Auto ML Model Fit on Test Set")+
theme_minimal()+
labs(x = "Date", y = "Bike Traffic")
# Tidy the BATS forecast and align it with the true test-set values.
# NOTE(review): at this point BATS_for holds the bootstrap = F forecast
# (the later of the two assignments above).
sweeped = BATS_for %>% sw_sweep() %>%
select(y, key) %>%
bind_cols(df_day %>% select(ds, ytrue = y)) %>%
filter(ds >= ymd("20180101"))
# Full-range comparison: observed series (grey), BATS point forecasts with a
# loess smoother (orange), AutoML predictions with a loess smoother (blue).
figure2 = time_df %>%
ggplot(aes(ds, observed))+
geom_point(color = "dark grey")+
geom_point(aes(y = y), color = "orange", data = sweeped)+
geom_smooth(aes(y =y), color = "orange", method = "loess", data =sweeped, se = F)+
geom_point(aes(ds, pred), data = error_tbl, color = "light blue")+
geom_smooth(aes(ds, pred), data = error_tbl, color = "light blue", method = "loess", se = F)+
labs(title = "Auto ML Model Fit on Test Set", subtitle = "Blue = Auto ML, Orange = TBATS")+
theme_minimal()+
labs(x = "Date", y = "Bike Traffic")
figure2
# Same comparison as above, but zoomed to the test period (2018 onward)
# by filtering the background series as well.
figure2 = time_df %>%
filter(ds >= ymd("20180101")) %>%
ggplot(aes(ds, observed))+
geom_point(color = "dark grey")+
geom_point(aes(y = y), color = "orange", data = sweeped)+
geom_smooth(aes(y =y), color = "orange", method = "loess", data =sweeped, se = F)+
geom_point(aes(ds, pred), data = error_tbl, color = "light blue")+
geom_smooth(aes(ds, pred), data = error_tbl, color = "light blue", method = "loess", se = F)+
labs(title = "Model Fit comparison", subtitle = "Blue = Auto ML, Orange = TBATS, Black = Test Set")+
theme_minimal()+
labs(x = "Date", y = "Bike Traffic")
figure2
Comparing our classical method back to back with the ML model, we can see a clear trend. The ML based model does a much better job of predicting the extreme events than the tbats algorithm.
# Summary error metrics for the AutoML model on the test set.
# Note mape/mpe here are proportions (0.27 = 27%), not percentages.
error_tbl %>%
summarise(
me = mean(error),
rmse = mean(error^2)^0.5,
mae = mean(abs(error)),
mape = mean(abs(error_pct)),
mpe = mean(error_pct)
) %>%
glimpse()
Observations: 1
Variables: 5
$ me   <dbl> -3910.108
$ rmse <dbl> 69900.04
$ mae  <dbl> 40934.89
$ mape <dbl> 0.2712599
$ mpe  <dbl> -0.1382674